{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Vwou_l883KAy"
      },
      "outputs": [],
      "source": [
        "import sys\n",
        "!{sys.executable} -m pip install pandas numpy scipy --quiet"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5ObfMTq3j8jC",
        "outputId": "1edea750-e69e-4776-a2ec-61759ddd64db"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "==============================================\n",
            "KRUSKAL–WALLIS TEST (Reporting Companies)\n",
            "==============================================\n",
            "Industries (k): 12\n",
            "Degrees of freedom: 11\n",
            "N: 136\n",
            "H statistic: 44.4650\n",
            "Exact p-value: 6.0241077e-06\n",
            "Conclusion: p < 0.0001\n",
            "\n",
            "==============================================\n",
            "FISHER'S EXACT TEST (Reporting vs Revenue)\n",
            "==============================================\n",
            "Contingency Table:\n",
            "                 Reporting   Not Reporting\n",
            "Revenue < $2B             15               42\n",
            "Revenue ≥ $2B            122               71\n",
            "Total                    137              113\n",
            "\n",
            "Odds Ratio: 0.207845\n",
            "Exact p-value: 9.2247731e-07\n",
            "Conclusion: p < 0.0001\n"
          ]
        }
      ],
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "from scipy.stats import kruskal, fisher_exact\n",
        "\n",
        "# -----------------------------\n",
        "# LOAD DATA\n",
        "# -----------------------------\n",
        "DATA_PATH = \"/content/data.xlsx\"  # change if needed\n",
        "\n",
        "if DATA_PATH.lower().endswith((\".xlsx\", \".xls\")):\n",
        "    df = pd.read_excel(DATA_PATH)\n",
        "else:\n",
        "    try:\n",
        "        df = pd.read_csv(DATA_PATH)\n",
        "    except UnicodeDecodeError:\n",
        "        df = pd.read_csv(DATA_PATH, encoding=\"latin1\")\n",
        "\n",
        "# -----------------------------\n",
        "# CLEAN NUMERIC FIELDS\n",
        "# -----------------------------\n",
        "def to_num(series):\n",
        "    return pd.to_numeric(series.astype(str).str.replace(\",\", \"\", regex=False), errors=\"coerce\")\n",
        "\n",
        "df[\"RevenuesGhgCo\"] = to_num(df[\"RevenuesGhgCo\"])\n",
        "df[\"Scope1+2Total\"] = to_num(df[\"Scope1+2Total\"])\n",
        "df[\"SasbIndustry\"] = df[\"SasbIndustry\"].astype(str).str.strip()\n",
        "\n",
        "# -----------------------------\n",
        "# DEFINE REPORTING\n",
        "# -----------------------------\n",
        "df[\"is_reporting\"] = df[\"Scope1+2Total\"].notna() & df[\"RevenuesGhgCo\"].notna() & (df[\"RevenuesGhgCo\"] > 0)\n",
        "\n",
        "# ==========================================================\n",
        "# 1) KRUSKAL–WALLIS (Reporting Companies Only)\n",
        "# ==========================================================\n",
        "df_rep = df[df[\"is_reporting\"]].copy()\n",
        "df_rep[\"Intensity\"] = df_rep[\"Scope1+2Total\"] / df_rep[\"RevenuesGhgCo\"]\n",
        "\n",
        "counts = df_rep[\"SasbIndustry\"].value_counts()\n",
        "valid_inds = counts[counts >= 2].index\n",
        "df_kw = df_rep[df_rep[\"SasbIndustry\"].isin(valid_inds)]\n",
        "\n",
        "k = df_kw[\"SasbIndustry\"].nunique()\n",
        "n = len(df_kw)\n",
        "\n",
        "groups = [g[\"Intensity\"].values for _, g in df_kw.groupby(\"SasbIndustry\")]\n",
        "H, p_kw = kruskal(*groups)\n",
        "\n",
        "print(\"==============================================\")\n",
        "print(\"KRUSKAL–WALLIS TEST (Reporting Companies)\")\n",
        "print(\"==============================================\")\n",
        "print(f\"Industries (k): {k}\")\n",
        "print(f\"Degrees of freedom: {k-1}\")\n",
        "print(f\"N: {n}\")\n",
        "print(f\"H statistic: {H:.4f}\")\n",
        "print(f\"Exact p-value: {p_kw:.8g}\")\n",
        "\n",
        "if p_kw < 0.0001:\n",
        "    print(\"Conclusion: p < 0.0001\")\n",
        "else:\n",
        "    print(\"Conclusion: p ≥ 0.0001\")\n",
        "\n",
        "# ==========================================================\n",
        "# 2) FISHER'S EXACT TEST (Table 3)\n",
        "# ==========================================================\n",
        "REV_THRESHOLD_M = 2000  # $2B threshold (if revenues are in millions)\n",
        "\n",
        "df_f = df[df[\"RevenuesGhgCo\"].notna() & (df[\"RevenuesGhgCo\"] > 0)].copy()\n",
        "df_f[\"rev_ge_2b\"] = df_f[\"RevenuesGhgCo\"] >= REV_THRESHOLD_M\n",
        "\n",
        "a = int((~df_f[\"rev_ge_2b\"] & df_f[\"is_reporting\"]).sum())\n",
        "b = int((~df_f[\"rev_ge_2b\"] & ~df_f[\"is_reporting\"]).sum())\n",
        "c = int(( df_f[\"rev_ge_2b\"] & df_f[\"is_reporting\"]).sum())\n",
        "d = int(( df_f[\"rev_ge_2b\"] & ~df_f[\"is_reporting\"]).sum())\n",
        "\n",
        "table = np.array([[a, b],\n",
        "                  [c, d]])\n",
        "\n",
        "print(\"\\n==============================================\")\n",
        "print(\"FISHER'S EXACT TEST (Reporting vs Revenue)\")\n",
        "print(\"==============================================\")\n",
        "print(\"Contingency Table:\")\n",
        "print(\"                 Reporting   Not Reporting\")\n",
        "print(f\"Revenue < $2B     {a:10d}   {b:14d}\")\n",
        "print(f\"Revenue ≥ $2B     {c:10d}   {d:14d}\")\n",
        "print(f\"Total             {a+c:10d}   {b+d:14d}\")\n",
        "\n",
        "if (b + d) == 0:\n",
        "    print(\"\\nCannot compute Fisher’s Exact — dataset contains no non-reporters.\")\n",
        "else:\n",
        "    oddsratio, p_fisher = fisher_exact(table, alternative=\"two-sided\")\n",
        "    print(f\"\\nOdds Ratio: {oddsratio:.6g}\")\n",
        "    print(f\"Exact p-value: {p_fisher:.8g}\")\n",
        "\n",
        "    if p_fisher < 0.0001:\n",
        "        print(\"Conclusion: p < 0.0001\")\n",
        "    else:\n",
        "        print(\"Conclusion: p ≥ 0.0001\")"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
